# Load the per-character table with explicit column types. `age` is read as
# text and converted to numeric afterwards.
characters_list <- readr::read_csv(
  here("data/character_list5.csv"),
  col_types = cols(
    script_id = col_integer(),
    imdb_character_name = col_character(),
    words = col_integer(),
    gender = col_character(),
    age = col_character()
  ),
  progress = FALSE
) %>%
  mutate(age = as.numeric(age))

# Structural overview of the loaded table.
glimpse(characters_list)
Observations: 23,048
Variables: 5
$ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 623, 623, 623, 623, 623...
$ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "francesca johns", "madg...
$ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, 409, 347, 2020, 366, ...
$ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f", "m", "m", "m", "m"...
$ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58, 53, 25, 39, 33, NA,...
# Load the per-film metadata with explicit column types, then re-encode the
# titles from latin1 to UTF-8.
meta_data <- readr::read_csv(
  here("data/meta_data7.csv"),
  col_types = cols(
    script_id = col_integer(),
    imdb_id = col_character(),
    title = col_character(),
    year = col_integer(),
    gross = col_integer(),
    lines_data = col_character()
  ),
  progress = FALSE
) %>%
  mutate(title = iconv(title, from = "latin1", to = "UTF-8"))

# Structural overview of the loaded table.
glimpse(meta_data)
Observations: 2,000
Variables: 6
$ script_id <int> 1534, 1512, 1514, 1517, 1520, 6537, 3778, 623, 1525, 6030, 625, 1509, 85...
$ imdb_id <chr> "tt1022603", "tt0147800", "tt0417385", "tt2024544", "tt1542344", "tt0450...
$ title <chr> "(500) Days of Summer", "10 Things I Hate About You", "12 and Holding", ...
$ year <int> 2009, 1999, 2005, 2013, 2010, 2007, 1992, 2001, 2009, 2013, 1968, 2009, ...
$ gross <int> 37, 65, NA, 60, 20, 91, 15, 37, 74, 80, 376, 192, 98, 204, 19, 59, 67, 3...
$ lines_data <chr> "74354452567747744433425777756577444344445644567454336755345277773423754...
# Attach film metadata to every character row and keep only rows whose film
# has a known `gross`.
# NOTE(review): the grouping looks redundant for drop_na(); confirm before
# removing it.
scripts_data <- characters_list %>%
  left_join(meta_data, by = "script_id") %>%
  group_by(title, year) %>%
  drop_na(gross) %>%
  ungroup()

# Structural overview of the joined table.
glimpse(scripts_data)
Observations: 19,387
Variables: 10
$ script_id <int> 280, 280, 280, 280, 280, 280, 280, 623, 623, 623, 623, 623, 623...
$ imdb_character_name <chr> "betty", "carolyn johnson", "eleanor", "francesca johns", "madg...
$ words <int> 311, 873, 138, 2251, 190, 723, 1908, 328, 409, 347, 2020, 366, ...
$ gender <chr> "f", "f", "f", "f", "f", "m", "m", "m", "f", "m", "m", "m", "m"...
$ age <dbl> 35, NA, NA, 46, 46, 38, 65, NA, 28, NA, 58, 53, 25, 39, 33, NA,...
$ imdb_id <chr> "tt0112579", "tt0112579", "tt0112579", "tt0112579", "tt0112579"...
$ title <chr> "The Bridges of Madison County", "The Bridges of Madison County...
$ year <int> 1995, 1995, 1995, 1995, 1995, 1995, 1995, 2001, 2001, 2001, 200...
$ gross <int> 142, 142, 142, 142, 142, 142, 142, 37, 37, 37, 37, 37, 37, 37, ...
$ lines_data <chr> "43320234343434432034334343344334343434344343443443334344434443...
# Histogram of `words` in 250-word bins, with the y axis expressed as the
# share of rows falling in each bin.
scripts_data %>%
  group_by(title, year) %>%
  unique() %>%
  ggplot(mapping = aes(x = words, y = (..count..) / sum(..count..))) +
  geom_histogram(
    color = "black",
    fill = "grey",
    boundary = 0,
    binwidth = 250
  )

# Violin plot of the same variable on a single, unlabeled x position.
scripts_data %>%
  group_by(title, year) %>%
  unique() %>%
  ggplot(mapping = aes(x = "", y = words)) +
  geom_violin(width = 0.5, fill = "grey")
# Per film (title, year): share of character rows whose gender is "f".
# `man_prop` is the complement, so it covers every row that is not "f".
scripts_data <- scripts_data %>%
  group_by(title, year) %>%
  mutate(
    fem_prop = sum(gender == "f") / n(),
    man_prop = 1 - fem_prop
  ) %>%
  ungroup()

# Peek at ten random rows of the new columns.
scripts_data %>%
  select(title, year, fem_prop, man_prop) %>%
  sample_n(10)
# Distribution of the female-cast proportion across films.
# `fem_prop` is constant within a (title, year) group, so the data is first
# reduced to one row per film. Deduplicating whole rows would keep one row per
# character and weight every film by the size of its cast.
scripts_data %>%
  distinct(title, year, fem_prop) %>%
  ggplot(aes(x = fem_prop, y = (..count..) / sum(..count..))) +
  geom_histogram(
    binwidth = 0.05,
    boundary = 0,
    fill = "grey",
    color = "black"
  )

# Violin plot of the same per-film values.
scripts_data %>%
  distinct(title, year, fem_prop) %>%
  ggplot(aes(x = "", y = fem_prop)) +
  geom_violin(fill = "grey", width = 0.5)
# Number of films per release year.
# The data is reduced to one row per (title, year) first; otherwise each bar
# would count character rows rather than films.
scripts_data %>%
  distinct(title, year) %>%
  ggplot(aes(x = year)) +
  geom_bar(fill = "grey", color = "black")

# Violin plot of release years, again with one value per film.
scripts_data %>%
  distinct(title, year) %>%
  ggplot(aes(x = "", y = year)) +
  geom_violin(fill = "grey", width = 0.5)
# Per film (title, year): mean number of words spoken by "f" characters and by
# "m" characters. A film with no character of a given gender gets 0 for that
# gender's mean.
scripts_data <- scripts_data %>%
  mutate(
    fem_words = ifelse(gender == "f", words, 0),
    man_words = ifelse(gender == "m", words, 0)
  ) %>%
  group_by(title, year) %>%
  mutate(
    mean_fem_words = ifelse(
      sum(gender == "f") == 0,
      0,
      sum(fem_words) / sum(gender == "f")
    ),
    mean_man_words = ifelse(
      sum(gender == "m") == 0,
      0,
      sum(man_words) / sum(gender == "m")
    )
  ) %>%
  ungroup()

# Peek at ten random rows of the new columns.
scripts_data %>%
  select(title, year, mean_fem_words, mean_man_words) %>%
  sample_n(10)
# Distribution of the mean words per female character across films.
# `mean_fem_words` is constant within a (title, year) group, so the data is
# reduced to one row per film before plotting; otherwise every film would be
# counted once per character. Films whose mean is 0 are excluded.
scripts_data %>%
  distinct(title, year, mean_fem_words) %>%
  filter(mean_fem_words != 0) %>%
  ggplot(aes(x = mean_fem_words, y = (..count..) / sum(..count..))) +
  geom_histogram(
    binwidth = 250,
    boundary = 0,
    fill = "grey",
    color = "black"
  )

# Violin plot of the same per-film values.
scripts_data %>%
  distinct(title, year, mean_fem_words) %>%
  filter(mean_fem_words != 0) %>%
  ggplot(aes(x = "", y = mean_fem_words)) +
  geom_violin(fill = "grey", width = 0.5)
# Distribution of the mean words per male character across films.
# `mean_man_words` is constant within a (title, year) group, so the data is
# reduced to one row per film before plotting; otherwise every film would be
# counted once per character. Films whose mean is 0 are excluded.
scripts_data %>%
  distinct(title, year, mean_man_words) %>%
  filter(mean_man_words != 0) %>%
  ggplot(aes(x = mean_man_words, y = (..count..) / sum(..count..))) +
  geom_histogram(
    binwidth = 250,
    boundary = 0,
    fill = "grey",
    color = "black"
  )

# Violin plot of the same per-film values.
scripts_data %>%
  distinct(title, year, mean_man_words) %>%
  filter(mean_man_words != 0) %>%
  ggplot(aes(x = "", y = mean_man_words)) +
  geom_violin(fill = "grey", width = 0.5)
# Film-level table used for clustering: the first row of each title, reduced
# to the film-level columns. slice(1) already leaves a single row per title,
# so no further de-duplication is needed.
# NOTE(review): grouping here is by `title` only, while the per-film columns
# were computed per (title, year) -- confirm this is intended.
data <- scripts_data %>%
  group_by(title) %>%
  slice(1) %>%
  ungroup() %>%
  select(title, year, gross, fem_prop, mean_fem_words, mean_man_words)
# Standardize every numeric column (zero mean, unit variance) for clustering.
# scale() returns a one-column matrix carrying centering/scaling attributes;
# as.numeric() turns it back into a plain vector so the tibble does not end up
# with matrix columns. A plain function is used instead of the deprecated
# funs() helper.
scaled_data <- data %>%
  select(-title) %>%
  mutate_all(function(column) as.numeric(scale(column)))

# Peek at ten random standardized rows.
scaled_data %>%
  sample_n(10)
A estatística Gap compara, para cada valor de k, a solução do agrupamento com a solução obtida em um conjunto de dados de referência no qual não há estrutura de grupos.
# Plot the gap statistic against the number of clusters, with error bars.
#
# Args:
#   clusgap: object with a `Tab` matrix that has `gap` and `SE.sim` columns,
#            one row per k (presumably the result of cluster::clusGap() --
#            TODO confirm against callers).
#   title:   plot title.
#
# Returns:
#   A ggplot object: a line and points for `gap` by k, with error bars
#   spanning gap - SE.sim to gap + SE.sim.
plot_clusgap <- function(clusgap, title = "Gap Statistic calculation results") {
  # Fail loudly when ggplot2 is missing; require() would only return FALSE.
  if (!requireNamespace("ggplot2", quietly = TRUE)) {
    stop("Package 'ggplot2' is required by plot_clusgap().", call. = FALSE)
  }

  # seq_len() stays correct for a table with zero rows (1:0 would not).
  gap_table <- data.frame(clusgap$Tab, k = seq_len(nrow(clusgap$Tab)))

  ggplot2::ggplot(gap_table, ggplot2::aes(k, gap)) +
    ggplot2::geom_line() +
    ggplot2::geom_point(size = 5) +
    ggplot2::geom_errorbar(
      ggplot2::aes(ymax = gap + SE.sim, ymin = gap - SE.sim),
      width = .2
    ) +
    ggplot2::ggtitle(title)
}
# Gap statistic for k = 1..8 using k-means, with 200 bootstrap reference
# samples. `FUNcluster` is spelled out in full rather than relying on partial
# matching of `FUN`; `nstart` and `iter.max` are forwarded to kmeans().
gaps <- clusGap(
  scaled_data,
  FUNcluster = kmeans,
  K.max = 8,
  B = 200,
  nstart = 20,
  iter.max = 30
)
Clustering k = 1,2,..., K.max (= 8): .. done
Bootstrapping, b = 1,2,..., B (= 200) [one "." per sample]:
.................................................. 50
.................................................. 100
.................................................. 150
.................................................. 200
# Draw the gap-statistic curve computed above.
plot_clusgap(gaps)

set.seed(123)
# Compute and plot the total within-cluster sum of squares for k = 1 to k = 15.
k.max <- 15
# vapply() guarantees one numeric value per k.
wss <- vapply(
  seq_len(k.max),
  function(k) {
    kmeans(scaled_data, k, nstart = 50, iter.max = 15)$tot.withinss
  },
  numeric(1)
)
plot(
  seq_len(k.max), wss,
  type = "b", pch = 19, frame.plot = FALSE,
  xlab = "Number of clusters K",
  ylab = "Total within-clusters sum of squares"
)
# NOTE(review): `varespec` is loaded here but not used by the lines below --
# confirm whether this call is still needed.
data(varespec)

# Silhouette plot for a 3-cluster k-means solution, computed on squared
# pairwise distances between the standardized rows.
dis <- dist(scaled_data)^2
res <- kmeans(scaled_data, centers = 3)
sil <- silhouette(res$cluster, dis)
plot(sil)
fitting ...
|
| | 0%
|
|= | 1%
|
|== | 2%
|
|== | 3%
|
|=== | 3%
|
|=== | 4%
|
|==== | 4%
|
|==== | 5%
|
|===== | 6%
|
|====== | 7%
|
|======= | 8%
|
|======= | 9%
|
|======== | 9%
|
|========= | 10%
|
|========= | 11%
|
|========== | 11%
|
|========== | 12%
|
|=========== | 12%
|
|=========== | 13%
|
|============ | 14%
|
|============= | 15%
|
|============= | 16%
|
|============== | 16%
|
|============== | 17%
|
|=============== | 17%
|
|=============== | 18%
|
|================ | 18%
|
|================ | 19%
|
|================= | 19%
|
|================= | 20%
|
|================== | 20%
|
|================== | 21%
|
|=================== | 22%
|
|==================== | 23%
|
|==================== | 24%
|
|===================== | 24%
|
|===================== | 25%
|
|====================== | 25%
|
|====================== | 26%
|
|======================= | 27%
|
|======================== | 27%
|
|======================== | 28%
|
|========================= | 29%
|
|========================== | 30%
|
|========================== | 31%
|
|=========================== | 31%
|
|=========================== | 32%
|
|============================ | 32%
|
|============================ | 33%
|
|============================= | 33%
|
|============================= | 34%
|
|============================== | 35%
|
|=============================== | 36%
|
|================================ | 37%
|
|================================= | 38%
|
|================================= | 39%
|
|================================== | 39%
|
|================================== | 40%
|
|=================================== | 40%
|
|=================================== | 41%
|
|==================================== | 42%
|
|===================================== | 43%
|
|===================================== | 44%
|
|====================================== | 44%
|
|====================================== | 45%
|
|======================================= | 45%
|
|======================================== | 46%
|
|======================================== | 47%
|
|========================================= | 47%
|
|========================================= | 48%
|
|========================================== | 48%
|
|========================================== | 49%
|
|=========================================== | 50%
|
|============================================ | 51%
|
|============================================ | 52%
|
|============================================= | 52%
|
|============================================= | 53%
|
|============================================== | 53%
|
|============================================== | 54%
|
|=============================================== | 55%
|
|================================================ | 55%
|
|================================================ | 56%
|
|================================================= | 56%
|
|================================================= | 57%
|
|================================================== | 58%
|
|=================================================== | 59%
|
|=================================================== | 60%
|
|==================================================== | 60%
|
|==================================================== | 61%
|
|===================================================== | 61%
|
|===================================================== | 62%
|
|====================================================== | 63%
|
|======================================================= | 64%
|
|======================================================== | 65%
|
|========================================================= | 66%
|
|========================================================= | 67%
|
|========================================================== | 67%
|
|========================================================== | 68%
|
|=========================================================== | 68%
|
|=========================================================== | 69%
|
|============================================================ | 69%
|
|============================================================ | 70%
|
|============================================================= | 71%
|
|============================================================== | 72%
|
|============================================================== | 73%
|
|=============================================================== | 73%
|
|================================================================ | 74%
|
|================================================================ | 75%
|
|================================================================= | 75%
|
|================================================================= | 76%
|
|================================================================== | 76%
|
|================================================================== | 77%
|
|=================================================================== | 78%
|
|==================================================================== | 79%
|
|==================================================================== | 80%
|
|===================================================================== | 80%
|
|===================================================================== | 81%
|
|====================================================================== | 81%
|
|====================================================================== | 82%
|
|======================================================================= | 82%
|
|======================================================================= | 83%
|
|======================================================================== | 83%
|
|======================================================================== | 84%
|
|========================================================================= | 84%
|
|========================================================================= | 85%
|
|========================================================================== | 86%
|
|=========================================================================== | 87%
|
|=========================================================================== | 88%
|
|============================================================================ | 88%
|
|============================================================================ | 89%
|
|============================================================================= | 89%
|
|============================================================================= | 90%
|
|============================================================================== | 91%
|
|=============================================================================== | 91%
|
|=============================================================================== | 92%
|
|================================================================================ | 93%
|
|================================================================================= | 94%
|
|================================================================================== | 95%
|
|================================================================================== | 96%
|
|=================================================================================== | 96%
|
|=================================================================================== | 97%
|
|==================================================================================== | 97%
|
|==================================================================================== | 98%
|
|===================================================================================== | 99%
|
|======================================================================================| 100%
Bayesian Information Criterion (BIC):
EII VII EEI VEI EVI VVI EEE EVE VEE
1 -23579.67 -23579.67 -23609.33 -23609.33 -23609.33 -23609.33 -23225.17 -23225.17 -23225.17
2 -23003.06 -21804.34 -22817.07 -21478.61 -21697.14 -21003.77 -22612.74 -22177.55 -21270.14
3 -22333.06 -21499.75 -22050.19 -21180.85 -20945.48 -20246.23 -21954.22 -20896.38 -21029.00
4 -22199.58 -21013.42 -22009.32 -20786.64 -20431.48 -19889.96 -21911.79 -20369.67 -20606.10
5 -22077.64 -20741.44 -22053.58 -20606.76 -20260.04 -19533.57 -21856.56 -20218.54 -20453.28
6 -21656.44 -20592.06 -21247.73 -20333.25 -20030.88 -19240.93 -21466.36 -20076.60 -20290.03
7 -21417.12 -20516.17 -21088.10 -20229.80 -19955.50 -19227.16 -21360.52 -19927.95 -20190.00
8 -21317.61 -20460.35 -21056.62 -20082.06 -19901.24 -19202.16 -21405.05 -19975.12 -20099.36
9 -21307.44 -20342.06 -20774.10 -19960.91 -19839.67 -19201.85 -21451.38 -19752.71 -20078.18
10 -21080.55 -20239.96 -20795.92 -19935.03 -19804.49 -19151.79 -21092.48 -19734.68 -19928.18
11 -21078.17 -20232.61 -20887.50 -19951.73 -19743.12 -19172.48 -21130.63 -19710.75 -19939.65
12 -20994.56 -20232.40 -20918.43 -19959.96 -19816.31 -19185.72 -20895.85 NA -19943.95
13 -21038.34 -20220.85 -20962.83 -19974.57 -19806.74 -19207.07 -20940.36 -19742.23 -19955.80
14 -21081.78 -20222.26 -21007.03 -19933.00 -19758.87 -19181.25 -20983.11 -19820.39 -19901.33
15 -21099.73 -20182.05 -20858.12 -19960.05 -19780.73 NA -20930.24 -19642.84 -19918.94
VVE EEV VEV EVV VVV
1 -23225.17 -23225.17 -23225.17 -23225.17 -23225.17
2 -21089.81 -21784.65 -20718.89 -21896.66 -20688.97
3 -20558.99 -21170.21 -20388.35 -20838.19 -20094.89
4 -20233.69 -21135.54 -20100.22 -20610.94 -19885.04
5 -20166.72 -20549.56 -19929.24 -20307.86 -19624.60
6 -19999.78 -20493.46 -19718.33 -20156.43 -19540.47
7 -20031.88 -20452.91 -19744.75 -20161.84 -19544.93
8 -19936.39 -20408.82 -19641.23 -20168.39 -19518.19
9 -19516.33 -20236.97 -19651.90 -20173.69 -19519.13
10 NA -20217.81 -19655.67 -20208.62 -19577.38
11 NA -20315.81 -19683.64 -20226.75 -19654.89
12 -19192.63 -20240.98 -19756.07 -20295.14 NA
13 -19155.69 -20358.12 -19783.59 -20230.65 -19763.31
14 -19271.62 -20230.29 -19765.60 -20266.80 -19852.96
15 -19198.36 -20372.79 -19849.51 -20351.47 -19925.35
Top 3 models based on the BIC criterion:
VVI,10 VVE,13 VVI,11
-19151.79 -19155.69 -19172.48
# Plot the BIC values stored in `d_clust`.
# NOTE(review): `d_clust` is created outside the visible part of this file.
plot(d_clust$BIC)

# Run every NbClust index for k = 2..5 with k-means on Euclidean distances.
nb <- NbClust(
  data = scaled_data,
  diss = NULL,
  distance = "euclidean",
  min.nc = 2,
  max.nc = 5,
  method = "kmeans",
  index = "all",
  alphaBeale = 0.1
)
*** : The Hubert index is a graphical method of determining the number of clusters.
In the plot of Hubert index, we seek a significant knee that corresponds to a
significant increase of the value of the measure i.e the significant peak in Hubert
index second differences plot.
*** : The D index is a graphical method of determining the number of clusters.
In the plot of D index, we seek a significant knee (the significant peak in Dindex
second differences plot) that corresponds to a significant increase of the value of
the measure.
*******************************************************************
* Among all indices:
* 7 proposed 2 as the best number of clusters
* 9 proposed 3 as the best number of clusters
* 3 proposed 4 as the best number of clusters
* 5 proposed 5 as the best number of clusters
***** Conclusion *****
* According to the majority rule, the best number of clusters is 3
*******************************************************************
# Histogram of the first row of `nb$Best.nc`, with the number of breaks set to
# the largest non-missing value in that row.
hist(
  nb$Best.nc[1, ],
  breaks = max(nb$Best.nc[1, ], na.rm = TRUE)
)
# toclust = x %>%
# rownames_to_column(var = "language") %>%
# select(1:5)
# dists = toclust %>%
# select(-language) %>%
# dist() # só para plotar silhouetas depois
# km = toclust %>%
# select(-language) %>%
# kmeans(centers = n_clusters, nstart = 20)
# km %>%
# augment(toclust) %>%
# gather(key = "variável", value = "valor", -language, -.cluster) %>%
# ggplot(aes(x = `variável`, y = valor, group = language, colour = .cluster)) +
# geom_point(alpha = 0.2) +
# geom_line(alpha = .5) +
# facet_wrap(~ .cluster)
library(GGally)
Attaching package: ‘GGally’
The following object is masked from ‘package:dplyr’:
nasa
# Example parallel-coordinate plot on the MASS `crabs` data: columns 4 to 8,
# coloured by the `sp` column.
data("crabs", package = "MASS")
ggparcoord(data = crabs, columns = 4:8, groupColumn = "sp")

# Same kind of plot for the standardized film data, using GGally.
library(GGally)
ggparcoord(data = scaled_data)

# And once more using lattice.
library(lattice)
parallelplot(scaled_data)
# 3-cluster k-means with 100 random starts, drawn with autoplot() (framed
# clusters) and converted to an interactive plotly widget.
km <- kmeans(scaled_data, centers = 3, nstart = 100)
p <- autoplot(km, data = scaled_data, frame = TRUE)
ggplotly(p)
n_clusters <- 3

# Put the film title back as the first column of the standardized table.
# The title is added directly because setting row names on a tibble is
# deprecated.
toclust <- scaled_data %>%
  add_column(title = data$title, .before = 1)
# k-means on the standardized columns (title excluded), 20 random starts.
km <- kmeans(
  select(toclust, -title),
  centers = n_clusters,
  nstart = 20
)

# Profile plot: one line per film across the standardized variables, one
# facet per cluster, with the axes flipped.
km %>%
  augment(toclust) %>%
  gather(key = "variável", value = "valor", -title, -.cluster) %>%
  ggplot(
    mapping = aes(
      x = `variável`,
      y = valor,
      group = title,
      colour = .cluster
    )
  ) +
  geom_point(alpha = 0.2) +
  geom_line(alpha = .5) +
  facet_wrap(~ .cluster) +
  coord_flip()
attributes are not identical across measure variables;
they will be dropped